package net.ming616.nlp.extraction.service.impl;

import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import net.ming616.nlp.extraction.service.HTMLExtractor;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

/**
 * {@link HTMLExtractor} implementation that downloads a page with jsoup and
 * extracts the text of the elements matched by caller-supplied selectors.
 */
@Service("htmlExtractor")
public class HTMLExtractorImpl implements HTMLExtractor {
	/**
	 * Logger for this class
	 */
	private static final Logger logger = Logger
			.getLogger(HTMLExtractorImpl.class);

	/**
	 * Timeout used when the configured value cannot be parsed as an integer.
	 */
	private static final int DEFAULT_TIMEOUT_MILLIS = 6000;

	/**
	 * Fetch timeout in milliseconds, kept as a string because it is injected
	 * from the {@code html.content.extraction.timeoutMillis} property.
	 */
	@Value("${html.content.extraction.timeoutMillis}")
	String timeoutMillis = "6000";

	/**
	 * Fetches the document at {@code url} and, for every entry of
	 * {@code selectors}, concatenates the text of all elements matched by the
	 * entry's value.
	 *
	 * <p>
	 * The text of each matched element is stripped and followed by a single
	 * space, so a non-empty result always ends with one trailing space; a
	 * selector that matches nothing maps to the empty string.
	 *
	 * @param url
	 *            address of the page to fetch
	 * @param selectors
	 *            result key to selector expression; when {@code null} or
	 *            empty, an empty map is returned without fetching the page
	 * @return a new map from each key of {@code selectors} to its extracted
	 *         text; empty when the page could not be fetched or parsed
	 */
	public Map<String, String> getContent(String url,
			Map<String, String> selectors) {
		Map<String, String> results = new HashMap<String, String>();
		// Nothing to extract: skip the network round trip entirely.
		if (selectors == null || selectors.isEmpty()) {
			return results;
		}
		Document doc;
		try {
			doc = Jsoup.parse(new URL(url), resolveTimeoutMillis());
		} catch (IOException e) {
			// Pass the exception itself so the stack trace and cause are kept.
			logger.error(" parse html context error, message is "
					+ e.getMessage(), e);
			return results;
		}
		for (Entry<String, String> entry : selectors.entrySet()) {
			Elements elements = doc.select(entry.getValue());
			StringBuilder buffer = new StringBuilder();
			for (Element element : elements) {
				buffer.append(StringUtils.stripToEmpty(element.text()))
						.append(' ');
			}
			results.put(entry.getKey(), buffer.toString());
		}
		return results;
	}

	/**
	 * Parses {@link #timeoutMillis}, tolerating surrounding whitespace, and
	 * falls back to {@link #DEFAULT_TIMEOUT_MILLIS} when it is not a valid
	 * integer so a configuration typo does not make every extraction fail.
	 */
	private int resolveTimeoutMillis() {
		try {
			return Integer.parseInt(StringUtils.stripToEmpty(timeoutMillis));
		} catch (NumberFormatException e) {
			logger.warn("Invalid html.content.extraction.timeoutMillis value '"
					+ timeoutMillis + "', falling back to "
					+ DEFAULT_TIMEOUT_MILLIS, e);
			return DEFAULT_TIMEOUT_MILLIS;
		}
	}
}
